# imports..
from sklearn.datasets import load_boston
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
import warnings
warnings.simplefilter('ignore')
# Gather Data....and Explore
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so on a current install the import at the top of this file raises.
# Either pin scikit-learn < 1.2 or rebuild the frame via
# `fetch_openml(name="boston")` / the original CMU source -- TODO confirm the
# scikit-learn version this script targets.
boston_dataset=load_boston()
type(boston_dataset)  # a Bunch: dict-like container with attribute access
boston_dataset
dir(boston_dataset)  # lists the available keys (data, target, feature_names, ...)
print(boston_dataset.DESCR)  # long-form text description of the dataset
print(boston_dataset.data)
print(boston_dataset.feature_names)
print(boston_dataset.filename)
print(boston_dataset.target)  # house prices (plotted later in 1000's)
type(boston_dataset.data)  # raw feature matrix is a numpy array
boston_dataset.data.shape
# Wrap the raw array in a labelled DataFrame, then append the target as PRICE.
data = pd.DataFrame(data = boston_dataset.data, columns = boston_dataset.feature_names)
data
data['PRICE'] = boston_dataset.target
data.head()
data.tail()
data.count()  # non-null count per column
pd.isnull(data)
pd.isnull(data).any()  # per-column check for missing values
data.info()
# --- Distribution of the target (PRICE) and of average room counts (RM) ---
# FIX: `sns.distplot` is deprecated and removed in current seaborn releases;
# the modern equivalents are `histplot` (histogram, optional KDE overlay,
# density scale matches what distplot drew) and `kdeplot` (curve only).

# Quick-look histograms: default binning, then 30 bins.
plt.hist(data['PRICE'])
plt.show()
plt.hist(data['PRICE'], bins = 30)
plt.show()

# Polished histogram: 50 bins, dark-red bars with black edges.
plt.figure(figsize= (10,6))
plt.hist(data['PRICE'], bins =50, ec = 'black', color = '#800000', alpha = 0.7)
plt.xlabel('Price in 1000\'s')
plt.ylabel('Nr of Houses')
plt.show()

# Histogram plus KDE overlay on a density scale (formerly sns.distplot).
sns.histplot(data['PRICE'], kde=True, stat='density')
plt.show()
plt.figure(figsize= (10,6))
sns.histplot(data['PRICE'], color = 'red', bins = 50, kde=True, stat='density')
plt.show()

# KDE curve only (formerly distplot(..., hist=False); bins do not apply here).
plt.figure(figsize= (10,6))
sns.kdeplot(data['PRICE'], color = 'red')
plt.show()

# Histogram only, raw counts (formerly distplot(..., kde=False)).
plt.figure(figsize= (10,6))
sns.histplot(data['PRICE'], color = 'red', bins = 50, kde=False)
plt.show()

# Average number of rooms per dwelling; default binning is enough here.
plt.figure(figsize= (10,6))
plt.hist(data['RM'], ec = 'black', color = 'pink')# removed bins
plt.xlabel('Average Number of Rooms')
plt.ylabel('Nr of Houses')
plt.show()
data['RM'].mean()
# --- Highway accessibility (RAD), CHAS counts and summary statistics ---

# RAD histogram with default binning first.
plt.figure(figsize=(10, 6))
plt.hist(data['RAD'], color='green', ec='black')
plt.xlabel('Accessibility to Highways')
plt.ylabel('Nr of Houses')
plt.show()

data['RAD'].value_counts()

# Same variable again, this time with 24 bins.
plt.figure(figsize=(10, 6))
plt.hist(data['RAD'], bins=24, color='green', ec='black')
plt.xlabel('Accessibility to Highways')
plt.ylabel('Nr of Houses')
plt.show()

# RAD is index-like, so a frequency bar chart reads better than a histogram.
rad_counts = data['RAD'].value_counts()
rad_counts.values
print(rad_counts.index)
type(rad_counts)

plt.figure(figsize=(10, 6))
plt.bar(rad_counts.index, rad_counts)
plt.show()

# Styled bar chart: narrower bars, custom fill and edge colours.
plt.figure(figsize=(10, 6))
plt.bar(rad_counts.index, rad_counts, width=0.5, color='brown', ec='blue')
plt.xlabel('Accessibility index to Highways')
plt.ylabel('Nr of Houses')
plt.show()

data['CHAS'].value_counts()

# Descriptive statistics: target column first, then the whole frame.
data['PRICE'].min()
data['PRICE'].max()
data['PRICE'].mean()
data['PRICE'].median()
data.min()
data.max()
data.mean()
data.median()
data.describe()
# Correlation
# Pairwise Pearson correlations with PRICE, then the full matrix.
data['PRICE'].corr(data['RM'])
data['PRICE'].corr(data['PTRATIO'])
data.corr()
# Heatmap
type(data.corr())  # data.corr() returns a DataFrame
plt.figure(figsize =(16,10))
sns.heatmap(data.corr(), annot = True, annot_kws ={'size':14} )
plt.show()
# FIX: the next three lines were help-text pasted straight out of the
# sns.heatmap docstring and are a SyntaxError when executed as code; kept as a
# comment for reference:
#   mask : boolean array or DataFrame, optional
#   If passed, data will not be shown in cells where mask is True.
#   Cells with missing values are automatically masked.
# Build a boolean mask hiding the redundant upper triangle of the matrix.
# FIX: build the mask with dtype=bool directly rather than assigning True into
# a float array.
masker = np.zeros_like(data.corr(), dtype=bool)
masker
type(masker)
triangle_indices = np.triu_indices_from(masker)  # row/col indices of the upper triangle
triangle_indices
masker[triangle_indices] = True
masker
plt.figure(figsize =(16,10))
sns.heatmap(data.corr(), annot = True, annot_kws ={'size':14}, mask = masker )
plt.xticks(fontsize = 10)
plt.yticks(fontsize = 10)
plt.show()
# --- Bivariate relationship plots ---

# DIS vs NOX scatter, correlation shown in the title.
nox_dis_corr = round(data['NOX'].corr(data['DIS']), 3)
plt.figure(figsize=(16, 8))
plt.title(f'Dist from Employment Centres vs Pollution , Corr ({nox_dis_corr})',
          color='green', fontsize=25)
plt.scatter(x=data['DIS'], y=data['NOX'], s=80, color='red', alpha=0.5)
plt.xlabel('Distance from Employment Centres...DIS', color='green', fontsize=14)
plt.ylabel('Level of Pollutants Nitric Oxide...NOX', color='green', fontsize=14)
plt.show()

# Same pairing as a jointplot (scatter plus marginal distributions).
sns.set()
sns.set_style('dark')
sns.jointplot(x=data['DIS'], y=data['NOX'], joint_kws={'alpha': 0.5}, color='red')
plt.show()

# Plain seaborn scatterplot version of the same pairing.
sns.set()
sns.set_style('dark')
plt.figure(figsize=(16, 8))
sns.scatterplot(x=data['DIS'], y=data['NOX'], alpha=0.5, color='red')
plt.show()

# Hexbin variant: darker hexagons mark regions of higher point density.
sns.set()
sns.set_style('dark')
sns.jointplot(x=data['DIS'], y=data['NOX'], kind='hex')
plt.show()

# TAX vs RAD: jointplot scatter, then an lmplot with a fitted regression line.
sns.set()
sns.set_style('dark')
sns.jointplot(x=data['TAX'], y=data['RAD'], joint_kws={'alpha': 0.5}, color='darkred')
plt.show()
sns.lmplot(x='TAX', y='RAD', data=data)
plt.show()

# RM vs PRICE: jointplot, lmplot with and without the confidence band.
sns.set()
sns.set_style('dark')
sns.jointplot(x=data['RM'], y=data['PRICE'], kind='scatter', height=7, color='indigo')
plt.show()
sns.lmplot(x='RM', y='PRICE', data=data, height=10, aspect=2)
plt.show()
sns.lmplot(x='RM', y='PRICE', data=data, height=10, aspect=2, ci=None)
plt.show()

# Annotated matplotlib scatter of RM vs PRICE with the correlation in the title.
rm_price_corr = round(data['RM'].corr(data['PRICE']), 3)
plt.figure(figsize=(16, 8))
plt.title(f'Rooms vs Price , Corr ({rm_price_corr})',
          color='green', fontsize=25)
plt.scatter(x=data['RM'], y=data['PRICE'], s=80, color='red', alpha=0.5)
plt.xlabel('RM ....No of ROOMs ', color='green', fontsize=14)
plt.ylabel('House price...PRICE', color='green', fontsize=14)
plt.show()

# Full pair plots are expensive to draw; left disabled on purpose.
# sns.pairplot(data)
# plt.show()
# sns.pairplot(data, kind = 'reg')
# plt.show()
# --- First model: linear regression on the raw (dollar) prices ---
prices = data['PRICE']                 # target vector
features = data.drop('PRICE', axis=1)  # every other column is a feature

# Hold out 20% for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, prices, test_size=0.2, random_state=10)
len(X_train) / len(features)           # fraction of rows used for training
X_train.head()

# Fit ordinary least squares with scikit-learn and inspect the estimates.
regr = LinearRegression()
regr.fit(X_train, y_train)
np.round(regr.coef_, decimals=4)       # slope per feature
regr.intercept_
pd.DataFrame(regr.coef_, index=X_train.columns, columns=['Coef'])
regr.score(X_train, y_train)           # in-sample r-squared
regr.score(X_test, y_test)             # out-of-sample r-squared
## Normal Distribution
from scipy.stats import norm
# Reference bell curve: normal pdf with mean 0 and standard deviation 3.
plt.figure(figsize = (10,5))
x_axis = np.arange(-10, 10, 0.001)
plt.plot(x_axis, norm.pdf(x_axis,0,3))
plt.show()
# Data Transformations
plt.figure(figsize= (16,8))
plt.hist(data['PRICE'], bins =50, ec = 'black', color = 'blue', alpha = 0.5)
plt.xlabel('Price in 1000\'s')
plt.ylabel('Nr of Houses')
plt.show()
# Compare skew before and after a log transform of the target.
data['PRICE'].skew()
y_log = np.log(data['PRICE'])
y_log.skew()
# FIX: `sns.distplot` is deprecated/removed in current seaborn; `histplot`
# with a KDE overlay on a density scale is the direct replacement.
sns.set()
plt.figure(figsize=(20,6))
sns.histplot(y_log, kde=True, stat='density')
plt.title(f'Log price: The skew is {round(y_log.skew(),2)}', fontsize = 25)# f string interpolates the skew value
plt.xlabel('LOG PRICES ', fontsize =14, color = 'green' )
plt.show()
# FIX: the original did `transformed_data = features`, which is only a second
# name for the SAME DataFrame -- adding the LOG_PRICE column below would then
# silently mutate `features` as well.  An explicit copy keeps them independent.
transformed_data = features.copy() # independent frame for the log-target view
transformed_data['LOG_PRICE'] = y_log # log price alongside the features
transformed_data
# LSTAT against the raw target, then against the log target.
sns.lmplot(x='LSTAT', y='PRICE', data=data, height=10, aspect =2,
scatter_kws={'alpha': 0.6}, line_kws={'color':'darkred'})
plt.show()
sns.lmplot(x='LSTAT', y='LOG_PRICE', data=transformed_data, height =10,aspect = 2,
scatter_kws={'alpha': 0.6}, line_kws={'color':'blue'})
plt.show()
# Refit the scikit-learn regression using log prices as the target.
prices = np.log(data['PRICE']) # Use log prices
features = data.drop('PRICE', axis=1)
X_train, X_test, y_train, y_test = train_test_split(features, prices,
test_size=0.2, random_state=10)
regr = LinearRegression()
regr.fit(X_train, y_train)
print('Training data r-squared:', regr.score(X_train, y_train))
print('Test data r-squared:', regr.score(X_test, y_test))
pd.DataFrame(data=regr.coef_, index=X_train.columns, columns=['coef'])
print('Intercept', regr.intercept_)
# --- Coefficients and p-values via statsmodels OLS ---
# sm.OLS fits an ordinary-least-squares regression; sm.add_constant appends
# the column of ones that lets the model estimate an intercept.  Deprecation
# warnings, if any, are muted by the filter installed with the imports at the
# top of the file.
ols_results = sm.OLS(y_train, sm.add_constant(X_train)).fit()
print(ols_results.params, '\n')   # coefficient estimates, intercept included
print(ols_results.pvalues,'\n')   # p-value for each coefficient
# Combined table; p-values rounded to 3 decimals.
pd.DataFrame({'coef': ols_results.params, 'p-value': round(ols_results.pvalues, 3)})
# Coefficients whose p-value exceeds 0.05 (see INDUS and AGE) are not
# statistically significant.
# Multicollinearity
# Variance inflation factors: how strongly each column of the design matrix is
# explained by the other columns.  `variance_inflation_factor` requires a plain
# ndarray (hence `.values`) and the integer index of the column to test.
variance_inflation_factor(exog=sm.add_constant(X_train).values, exog_idx=10)# TAX is at index 10
X_incl_const = sm.add_constant(X_train)
X_incl_const.shape[1]
# FIX: the original loop rebuilt `sm.add_constant(X_train).values` on every
# iteration; the matrix is loop-invariant, so hoist it out and use a
# comprehension instead of append-in-a-loop.
exog_matrix = X_incl_const.values
vif = [variance_inflation_factor(exog=exog_matrix, exog_idx=i)
       for i in range(X_incl_const.shape[1])]
print(vif)
pd.DataFrame({'coef_name': X_incl_const.columns, 'vif': np.around(vif, 2)})
# Baseline model: every feature against the log price.
X_incl_const = sm.add_constant(X_train)
results_1 = sm.OLS(y_train, X_incl_const).fit()
org_coef = pd.DataFrame({'coef': results_1.params, 'p-value': round(results_1.pvalues, 3)})
print('BIC with all features and the log price is :' , results_1.bic, '\n')
print('r-squared is with all features and log price is :' , results_1.rsquared, '\n')
print(org_coef)

# Same model after removing the INDUS feature.
X_incl_const = sm.add_constant(X_train).drop(['INDUS'], axis=1)
results_2 = sm.OLS(y_train, X_incl_const).fit()
coef_minus_indus = pd.DataFrame({'coef': results_2.params, 'p-value': round(results_2.pvalues, 3)})
print('BIC without INDUS features and the log price is :' , results_2.bic, '\n')
print('r-squared without INDUS feature and log price is :' , results_2.rsquared, '\n')
print(coef_minus_indus)
# Third candidate model: drop both INDUS and AGE.
X_incl_const = sm.add_constant(X_train)
X_incl_const = X_incl_const.drop(['INDUS', 'AGE'], axis =1)
model = sm.OLS(y_train, X_incl_const)
results_3 = model.fit()
coef_minus_indus_age = pd.DataFrame({'coef': results_3.params, 'p-value': round(results_3.pvalues, 3)})
# FIX: the printed labels said "without INDUS" even though this model also
# drops AGE -- a copy-paste error from the previous cell.
print('BIC without INDUS and AGE features and the log price is :' , results_3.bic, '\n')
print('r-squared without INDUS and AGE features and log price is :' , results_3.rsquared, '\n')
print(coef_minus_indus_age)
# Inspect the reduced matrix, then rebuild the full one and compare shapes.
X_incl_const
X_incl_const = sm.add_constant(X_train)
X_incl_const
X_incl_const.shape
X_train.shape
# Residuals vs fitted values for the reduced (minus INDUS/AGE) log model.
plt.figure(figsize=(20,6))
plt.scatter(x=results_3.fittedvalues, y=results_3.resid, c='brown', alpha=0.8)
# FIX: mathtext labels must be raw strings -- '\h' is an invalid escape
# sequence in a plain literal (SyntaxWarning from Python 3.12, future error).
plt.xlabel(r'Predicted log prices $\hat y _i$', fontsize=20)
plt.ylabel('Residuals', fontsize=20)
plt.title('Residuals vs Fitted Values of the reduced log model', fontsize=20)
plt.show()
# Mean Squared Error & R-Squared
reduced_log_mse_minus_indus_minus_age = round(results_3.mse_resid, 3)# kept for the comparison later
reduced_log_rsquared_minus_indus_minus_age = round(results_3.rsquared, 3) # kept for the comparison later
# Fourth candidate model: drop several more features at once.
X_incl_const = sm.add_constant(X_train)
X_incl_const = X_incl_const.drop(['INDUS', 'AGE','LSTAT', 'RM', 'NOX', 'CRIM' ], axis =1)
model = sm.OLS(y_train, X_incl_const)
results_4 = model.fit()
coef_minus_many_variables = pd.DataFrame({'coef': results_4.params, 'p-value': round(results_4.pvalues, 3)})
print('BIC after dropping many variables and the log price is :' , results_4.bic, '\n')
print('r-squared after dropping many variables and log price is :' , results_4.rsquared, '\n')
print(coef_minus_many_variables)
plt.figure(figsize=(20,6))
plt.scatter(x=results_4.fittedvalues, y=results_4.resid, c='brown', alpha=0.8)
plt.xlabel(r'Predicted log prices after omitting many variables $\hat y _i$', fontsize=20)
plt.ylabel('Residuals', fontsize=20)
plt.title('Residuals vs Fitted Values after ommitting many variables', fontsize=20)
plt.show()
# Mean Squared Error & R-Squared
ommited_var_mse = round(results_4.mse_resid, 3)# kept for the comparison later
ommitted_var_rsquared = round(results_4.rsquared, 3) # kept for the comparison later
# Distribution of the reduced model's residuals: mean and skew in the title.
resid_mean = round(results_3.resid.mean(), 3)
resid_mean
resid_skew = round(results_3.resid.skew(), 3)
resid_skew
# FIX: `sns.distplot` is deprecated/removed; histplot with a KDE overlay on a
# density scale is the direct replacement.
plt.figure(figsize= (20,6))
sns.histplot(results_3.resid, color='navy', kde=True, stat='density')
plt.title(f'Residual Dist of Log price model w/o INDUS and AGE:residuals Skew ({resid_skew}) Mean ({resid_mean})',fontsize = 20)
plt.show()
# Side-by-side coefficient tables and BIC / r-squared for all four models.
frames = [org_coef, coef_minus_indus, coef_minus_indus_age, coef_minus_many_variables]
pd.concat(frames, axis=1, sort = False)
print(f'BIC WITH ALL:{round(results_1.bic)}:W/O INDUS {round(results_2.bic)}:W/O INDUS AND AGE {round(results_3.bic)}:W/O MANY VARIABLES {round(results_4.bic)}')
print(f'RSQD:WITH ALL {round((results_1.rsquared),2)}: W/O INDUS {round((results_2.rsquared),2)}: W/O INDUS,AGE {round((results_3.rsquared),2)}:W/O MANY VARIABLES {round((results_4.rsquared),2)}')
# Our Estimate for a house is 30000....calculate the upper bound and the lower bound using the log model_minus_indus_minus_age
# 95% interval: roughly +/- 2 residual standard deviations around log(30).
sd_1 = round(np.sqrt(results_3.mse_resid),3)
sd_2 = 2*(round(np.sqrt(results_3.mse_resid),3))
print('1 s.d. in log prices is', sd_1)
print('2 s.d. in log prices is', sd_2)
upper_bound = np.log(30) + sd_2
print('The upper bound in log prices for a 95% prediction interval is ', upper_bound)
print('The upper bound in normal prices is $', np.e**upper_bound * 1000)
lower_bound = np.log(30) - sd_2
print('The lower bound in log prices for a 95% prediction interval is ', lower_bound)
print('The lower bound in normal prices is $', np.e**lower_bound * 1000)
# --- Refit on the original dollar prices, for comparison with the log model ---
data
house_prices = data['PRICE']
house_features = data.drop(['PRICE'], axis =1)
X_train, X_test, y_train, y_test = train_test_split(house_features, house_prices,
test_size=0.2, random_state=10)
X_incl_const_to_house_features = sm.add_constant(X_train)
model = sm.OLS(y_train, X_incl_const_to_house_features)
results_5 = model.fit()
round(results_5.params,3)
results_5.bic
results_5.rsquared
round(results_5.pvalues,2)
plt.figure(figsize=(20,6))
plt.scatter(x=results_5.fittedvalues, y=results_5.resid, c='brown', alpha=0.8)
# FIX: raw string for the mathtext label -- '\h' is an invalid escape sequence
# in a plain literal.
plt.xlabel(r'Predicted Original prices $\hat y _i$', fontsize=20)
plt.ylabel('Residuals', fontsize=20)
plt.title('Residuals vs Fitted Values of the original model', fontsize=20)
plt.show()
# Mean Squared Error & R-Squared
original_mse = round(results_5.mse_resid, 3)# kept for later comparison
original_rsquared = round(results_5.rsquared, 3) # kept for later comparison
# FIX: distplot -> histplot; the title also had a stray ')' and a needless
# f-prefix with no placeholders.
plt.figure(figsize= (20,6))
sns.histplot(results_5.resid, color='navy', kde=True, stat='density')
plt.title('Residual Dist Dollar price model',fontsize = 20)
plt.show()
# FIX: the final print ended with a doubled closing parenthesis.
print(f'Skew:({round(results_5.resid.skew(),2)}), residuals mean : ({round(results_5.resid.mean(),2)})')